@InProceedings{LopesOlivAlmeArau:2009:SpFrBa,
author = "Lopes, Ana Paula Brand{\~a}o and Oliveira, Rodrigo Silva and
Almeida, Jussara Marques de and Ara{\'u}jo, Arnaldo de
Albuquerque",
affiliation = "{Federal University of Minas Gerais/State University of Santa
Cruz} and {Federal University of Minas Gerais} and {Federal
University of Minas Gerais} and {Federal University of Minas
Gerais}",
title = "Spatio-Temporal Frames in a Bag-of-Visual-Features Approach for
                 Human Actions Recognition",
booktitle = "Proceedings...",
year = "2009",
editor = "Nonato, Luis Gustavo and Scharcanski, Jacob",
organization = "Brazilian Symposium on Computer Graphics and Image Processing, 22.
(SIBGRAPI)",
publisher = "IEEE Computer Society",
address = "Los Alamitos",
keywords = "Human Actions, Bag-of-Visual-Features, Video Classification",
abstract = "The recognition of human actions from videos has several
                 interesting and important applications, and a vast number of
                 different approaches have been proposed for this task in
                 different settings. Such approaches can be broadly categorized
                 into model-based and model-free. Typically, model-based
                 approaches work only in very constrained settings, and because
                 of that, a number of model-free approaches have appeared in
                 recent years. Among them, those based on bag-of-visual-features
                 (BoVF) have proven to be the most consistently successful,
                 being used by several independent authors. For videos to be
                 represented by BoVFs, though, an important issue that arises
                 is how to represent dynamic information. Most existing
                 proposals consider the video as a spatio-temporal volume and
                 then describe volumetric patches around 3D interest points. In
                 this work, we propose to build a BoVF representation for
                 videos by collecting 2D interest points directly. The basic
                 idea is to gather such points not only from the traditional
                 frames (xy planes), but also from the planes along the time
                 axis, which we call spatio-temporal frames. Our assumption is
                 that such features are able to capture dynamic information
                 from the videos and are therefore well suited to recognizing
                 human actions, without the need for 3D extensions of the
                 descriptors. In our experiments, this approach achieved
                 state-of-the-art recognition rates on a well-known human
                 actions database, even when compared to more sophisticated
                 schemes.",
conference-location = "Rio de Janeiro, RJ, Brazil",
conference-year = "11-14 Oct. 2009",
doi = "10.1109/SIBGRAPI.2009.17",
language = "en",
ibi = "8JMKD3MGPBW4/35THUHP",
url = "http://urlib.net/ibi/8JMKD3MGPBW4/35THUHP",
targetfile = "sibgrapi-actions-2009-FINAL-5-no-bookmarks.pdf",
urlaccessdate = "2024, Apr. 28"
}